library(mice)
library(tidyverse)
# Read the two cleaned input tables used throughout the script.
# NOTE(review): judging by the file names, `train` holds wave 1 and `test`
# holds waves 2-3 -- confirm against the data preparation step.
train <- utils::read.csv(file = "../clean_data/mci_wv1go.csv")
test <- utils::read.csv(file = "../clean_data/mci_wv23.csv")
# The overall missingness patterns differ considerably between waves, so the
# train and test sets are imputed separately to avoid information leakage.
# Share of missing values per variable in each set, reshaped to long format
# (one row per variable/set combination) for plotting.
# NOTE(review): this assumes `train` and `test` have the same columns in the
# same order, since both are labelled with names(train) -- confirm.
df_bar <- pivot_longer(
  data.frame(
    variable = names(train),
    train = colMeans(is.na(train)),
    test = colMeans(is.na(test))
  ),
  cols = c("train", "test"),
  names_to = "set",
  values_to = "Missingness"
)

# Side-by-side bars for train and test, with variables ordered from the
# highest to the lowest mean missingness; x labels are rotated to stay legible.
ggplot(
  df_bar,
  aes(x = reorder(variable, desc(Missingness)), y = Missingness, fill = set)
) +
  geom_col(position = "dodge") +
  xlab("Feature") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
# Optional missing-data pattern tables for each set (left disabled):
# md.pattern(train, rotate.names = T)
# md.pattern(test, rotate.names = T)
# Flux plot of each set, drawn separately for train and test. The calls are
# kept as plain one-liners with the data object passed by name.
fluxplot(train)
fluxplot(test)
# ---- Impute the training set ----
# quickpred() picks the predictors used for each incomplete variable; mice()
# then creates 5 imputed data sets with 5 iterations each under a fixed seed.
pred <- quickpred(train)
imp <- mice(
  train,
  m = 5,
  maxit = 5,
  predictorMatrix = pred,
  seed = 1,
  printFlag = FALSE,
  # Ridge penalty set higher than mice's default (1e-05) to address
  # collinearity among the predictors.
  ridge = 0.001
)
# A previous run reported "Warning: Number of logged events: 1444", probably
# due to linear combinations among the variables; see imp$loggedEvents for the
# individual entries.
# imp$method # all use pmm

# Diagnostics: distributions of observed versus imputed values.
bwplot(imp, layout = c(3, 1))
stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))

# Export: only the first of the 5 completed data sets is written out.
# `action` is the complete() argument that selects the imputation; `m` is not
# one of its arguments, so the data set is requested explicitly here.
train_imp <- complete(imp, action = 1L)
# NOTE(review): write.csv() also writes the row names as an unnamed first
# column; kept as is because downstream readers may rely on that layout.
write.csv(train_imp, file = "../clean_data/mci_wv1go_imp.csv")
# ---- Impute the test set ----
# Same procedure as for the training set, fitted on the test data only so that
# no information flows between the two sets.
pred <- quickpred(test)
imp <- mice(
  test,
  m = 5,
  maxit = 5,
  predictorMatrix = pred,
  seed = 1,
  printFlag = FALSE,
  # Ridge penalty set higher than mice's default (1e-05) to address
  # collinearity among the predictors.
  ridge = 0.001
)
# A previous run reported "Warning: Number of logged events: 1211", probably
# due to linear combinations among the variables; see imp$loggedEvents for the
# individual entries.
# imp$method # all use pmm

# Diagnostics: distributions of observed versus imputed values.
bwplot(imp, layout = c(3, 1))
stripplot(imp, pch = c(21, 20), cex = c(1, 1.5), layout = c(3, 1))